1 Plot multiple lines with base R and ggplot2

# Weekly enterovirus visit counts, one column per year (2010-2016).
# NOTE(review): the file has a .csv extension but is read as tab-separated
# (read_tsv) -- confirm the file really is TSV.
df <- readr::read_tsv("data/EV_TP_10_16.csv")
## Parsed with column specification:
## cols(
##   就診年週 = col_integer(),
##   `2010` = col_integer(),
##   `2011` = col_integer(),
##   `2012` = col_integer(),
##   `2013` = col_integer(),
##   `2014` = col_integer(),
##   `2015` = col_integer(),
##   `2016` = col_integer()
## )
# Drop the leading week-index column; keep only the per-year counts.
df <- df[, -1]
# Drop the final row (presumably a trailing totals/footer line -- confirm).
df <- head(df, -1)
names(df)
## [1] "2010" "2011" "2012" "2013" "2014" "2015" "2016"
head(df)
## # A tibble: 6 x 7
##   `2010` `2011` `2012` `2013` `2014` `2015` `2016`
##    <int>  <int>  <int>  <int>  <int>  <int>  <int>
## 1     86     57    183    114    135     50    119
## 2     92     55    210    100    106     39    101
## 3     80     51    211    110     79     52    114
## 4     87     45    444    106     82     75     94
## 5     90     73    116     86    140     64     80
## 6    105     49    103     99     95     48    245

1.1 Plot by matplot()

# matplot() draws one line per column; give each year its own color.
n_years <- ncol(df)
matplot(df, type = "l", lty = 1, col = seq_len(n_years))
# FIX: the original passed col = names(df) ("2010", "2011", ...), which are
# not valid color specifications, so the legend colors could not match the
# lines. Also use lty (not pch): the plot draws lines, not point symbols.
legend("topleft", legend = names(df), col = seq_len(n_years), lty = 1)

1.2 Plot by ggplot2

library(ggplot2)
# FIX: derive the week index from the data instead of hard-coding 1:52, so
# the plot stays correct if the table gains or loses rows.
weeks <- seq_len(nrow(df))
ggplot(df, aes(x = weeks)) +
  geom_line(aes(y = `2010`), color = "grey") +
  geom_line(aes(y = `2011`), color = "black") +
  geom_line(aes(y = `2012`), color = "blue") +
  geom_line(aes(y = `2013`), color = "green") +
  geom_line(aes(y = `2014`), color = "cyan") +
  geom_line(aes(y = `2015`), color = "yellow") +
  geom_line(aes(y = `2016`), color = "red") +
  ylab(label = "Number of infected cases") +
  xlab("Week")

2 Tsai’s fan page as the case

2.1 Loading data

# Attach the data-manipulation / plotting packages.
# NOTE(review): library(tidyverse) already attaches ggplot2 and dplyr, so the
# two extra library() calls are redundant but harmless.
library(tidyverse)
library(ggplot2)
library(dplyr)
# Facebook posts scraped from Tsai's fan page. Columns used below:
# created_time, updated_time, type, message, shares_count, likes_count,
# comments_count (see the summary() output further down).
post <- readRDS("data/posts_tsai.rds")
# Quick NA audit from a previous run: only shares_count has missing values.
# sum(is.na(post$shares_count)) # 947
# sum(is.na(post$likes_count)) # 0
# sum(is.na(post$comments_count)) # 0

2.2 Filter 2016 posts by year

  • Filter posts from 2016-01-01 to 2017-01-01
  • Create new variables hour, month, and week.
# Keep only posts created in 2016.
post2016 <- post[post$created_time >= as.POSIXct('2016-01-01'), ]
post2016 <- post2016[post2016$created_time < as.POSIXct('2017-01-01'), ]
# FIX: the original then ran `post2016 <- post`, silently throwing the date
# filter away -- the summary() output below (captured from that run) still
# spans 2008-2017 and shows the pre-fix columns.

library(lubridate)
# Derive time-of-posting features from the creation timestamp.
post2016$hour <- hour(post2016$created_time)
# FIX: the column was misspelled "mounth" in the original.
post2016$month <- month(post2016$created_time)
post2016$week <- week(post2016$created_time)
# FIX: the original assigned wday() back into `week`, clobbering the week
# number (hence week max = 7 in the old summary); store it in its own column.
post2016$wday <- wday(post2016$created_time)
summary(post2016)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
##   created_time                      id           
##  Min.   :2008-10-22 21:55:20   Length:4112       
##  1st Qu.:2010-11-24 13:59:31   Class :character  
##  Median :2011-12-28 09:25:28   Mode  :character  
##  Mean   :2012-08-26 13:05:56                     
##  3rd Qu.:2013-12-25 17:06:44                     
##  Max.   :2017-01-28 13:10:00                     
##                                                  
##   updated_time                     type             message         
##  Min.   :2008-11-13 13:19:43   Length:4112        Length:4112       
##  1st Qu.:2011-12-08 10:29:49   Class :character   Class :character  
##  Median :2012-01-01 14:07:49   Mode  :character   Mode  :character  
##  Mean   :2012-12-08 21:47:22                                        
##  3rd Qu.:2014-07-27 13:21:22                                        
##  Max.   :2017-05-31 00:09:30                                        
##                                                                     
##   shares_count      likes_count       comments_count         hour      
##  Min.   :    1.0   Min.   :     0.0   Min.   :    0.0   Min.   : 0.00  
##  1st Qu.:   11.5   1st Qu.:    82.8   1st Qu.:    3.0   1st Qu.:10.00  
##  Median :  227.0   Median :   394.0   Median :   39.0   Median :14.00  
##  Mean   :  386.1   Mean   : 12955.1   Mean   :  360.9   Mean   :13.14  
##  3rd Qu.:  442.0   3rd Qu.: 19995.0   3rd Qu.:  395.2   3rd Qu.:17.00  
##  Max.   :12015.0   Max.   :358502.0   Max.   :38298.0   Max.   :23.00  
##  NA's   :1565                                                          
##      mounth            week      
##  Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 6.000   1st Qu.:3.000  
##  Median : 9.000   Median :4.000  
##  Mean   : 8.373   Mean   :4.196  
##  3rd Qu.:12.000   3rd Qu.:6.000  
##  Max.   :12.000   Max.   :7.000  
## 

2.3 Filter year 2016 posts by dplyr and magrittr

# Same 2016 filter, step by step with dplyr verbs.
# (POSIXct is preferred over POSIXlt for comparisons against a POSIXct
# column; the comparison result is the same.)
post2016 <- filter(post, created_time > as.POSIXct('2016-01-01'))
post2016 <- filter(post2016, created_time < as.POSIXct('2017-01-01'))
post2016 <- mutate(post2016, hour = hour(created_time))
post2016 <- mutate(post2016, month = month(created_time))
post2016 <- mutate(post2016, week = week(created_time))
# ... and as a single pipeline.
post2016 <- post %>%
    filter(created_time > as.POSIXct('2016-01-01')) %>%
    filter(created_time < as.POSIXct('2017-01-01')) %>%
    # FIX: the original wrote hour(post2016$created_time) etc. inside the
    # pipe, which reads the *previous* post2016 variable from the enclosing
    # environment instead of the piped data; use the bare column name.
    mutate(hour = hour(created_time)) %>%
    mutate(month = month(created_time)) %>%
    mutate(week = week(created_time))
summary(post2016)
##   created_time                      id           
##  Min.   :2016-01-01 12:00:02   Length:489        
##  1st Qu.:2016-02-06 05:58:53   Class :character  
##  Median :2016-05-10 17:33:50   Mode  :character  
##  Mean   :2016-05-21 23:50:35                     
##  3rd Qu.:2016-08-20 10:20:51                     
##  Max.   :2016-12-31 14:18:02                     
##                                                  
##   updated_time                     type             message         
##  Min.   :2016-01-01 16:00:01   Length:489         Length:489        
##  1st Qu.:2016-05-12 01:10:41   Class :character   Class :character  
##  Median :2016-08-17 14:32:46   Mode  :character   Mode  :character  
##  Mean   :2016-08-30 22:01:45                                        
##  3rd Qu.:2017-01-23 18:04:58                                        
##  Max.   :2017-05-31 00:09:30                                        
##                                                                     
##   shares_count     likes_count     comments_count       hour      
##  Min.   :   1.0   Min.   :   278   Min.   :   12   Min.   : 0.00  
##  1st Qu.: 314.0   1st Qu.: 19208   1st Qu.:  699   1st Qu.:12.00  
##  Median : 471.5   Median : 28359   Median :  973   Median :15.00  
##  Mean   : 807.9   Mean   : 38467   Mean   : 1572   Mean   :14.85  
##  3rd Qu.: 817.5   3rd Qu.: 43034   3rd Qu.: 1493   3rd Qu.:18.00  
##  Max.   :8613.0   Max.   :358502   Max.   :38298   Max.   :23.00  
##  NA's   :11                                                       
##      month             week      
##  Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000   1st Qu.: 6.00  
##  Median : 5.000   Median :19.00  
##  Mean   : 5.157   Mean   :20.78  
##  3rd Qu.: 8.000   3rd Qu.:34.00  
##  Max.   :12.000   Max.   :53.00  
## 
post2016 <- post %>%
    # The date filter is deliberately left commented out: the summary below
    # and later plots span all years, using the `year` column added here.
    # filter(created_time > as.POSIXlt('2016-01-01'), 
    #      created_time < as.POSIXlt('2017-01-01')) %>%
    # Derive all time-of-posting features in one mutate.
    mutate(hour = hour(created_time), 
           month = month(created_time), 
           week = week(created_time),
           wday = wday(created_time),
           year = year(created_time))
## Warning in as.POSIXlt.POSIXct(x, tz = tz(x)): unknown timezone 'zone/tz/
## 2018c.1.0/zoneinfo/Asia/Taipei'

## Warning in as.POSIXlt.POSIXct(x, tz = tz(x)): unknown timezone 'zone/tz/
## 2018c.1.0/zoneinfo/Asia/Taipei'

## Warning in as.POSIXlt.POSIXct(x, tz = tz(x)): unknown timezone 'zone/tz/
## 2018c.1.0/zoneinfo/Asia/Taipei'

## Warning in as.POSIXlt.POSIXct(x, tz = tz(x)): unknown timezone 'zone/tz/
## 2018c.1.0/zoneinfo/Asia/Taipei'

## Warning in as.POSIXlt.POSIXct(x, tz = tz(x)): unknown timezone 'zone/tz/
## 2018c.1.0/zoneinfo/Asia/Taipei'
summary(post2016)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
##   created_time                      id           
##  Min.   :2008-10-22 21:55:20   Length:4112       
##  1st Qu.:2010-11-24 13:59:31   Class :character  
##  Median :2011-12-28 09:25:28   Mode  :character  
##  Mean   :2012-08-26 13:05:56                     
##  3rd Qu.:2013-12-25 17:06:44                     
##  Max.   :2017-01-28 13:10:00                     
##                                                  
##   updated_time                     type             message         
##  Min.   :2008-11-13 13:19:43   Length:4112        Length:4112       
##  1st Qu.:2011-12-08 10:29:49   Class :character   Class :character  
##  Median :2012-01-01 14:07:49   Mode  :character   Mode  :character  
##  Mean   :2012-12-08 21:47:22                                        
##  3rd Qu.:2014-07-27 13:21:22                                        
##  Max.   :2017-05-31 00:09:30                                        
##                                                                     
##   shares_count      likes_count       comments_count         hour      
##  Min.   :    1.0   Min.   :     0.0   Min.   :    0.0   Min.   : 0.00  
##  1st Qu.:   11.5   1st Qu.:    82.8   1st Qu.:    3.0   1st Qu.:10.00  
##  Median :  227.0   Median :   394.0   Median :   39.0   Median :14.00  
##  Mean   :  386.1   Mean   : 12955.1   Mean   :  360.9   Mean   :13.14  
##  3rd Qu.:  442.0   3rd Qu.: 19995.0   3rd Qu.:  395.2   3rd Qu.:17.00  
##  Max.   :12015.0   Max.   :358502.0   Max.   :38298.0   Max.   :23.00  
##  NA's   :1565                                                          
##      month             week           wday            year     
##  Min.   : 1.000   Min.   : 1.0   Min.   :1.000   Min.   :2008  
##  1st Qu.: 6.000   1st Qu.:23.0   1st Qu.:3.000   1st Qu.:2010  
##  Median : 9.000   Median :39.0   Median :4.000   Median :2011  
##  Mean   : 8.373   Mean   :34.8   Mean   :4.196   Mean   :2012  
##  3rd Qu.:12.000   3rd Qu.:50.0   3rd Qu.:6.000   3rd Qu.:2013  
##  Max.   :12.000   Max.   :53.0   Max.   :7.000   Max.   :2017  
## 

3 Summarize data

3.1 summarize by tapply(), aggregate()

# Mean engagement per posting hour (named vectors indexed by hour 0-23).
likes_byhour <- with(post2016, tapply(likes_count, hour, mean))
comments_byhour <- with(post2016, tapply(comments_count, hour, mean))

# FIX: the original stored both aggregate() results in a variable called
# `summary` (masking base::summary()), and the second assignment silently
# discarded the first result; use distinct, descriptive names.
byhour_mean <- aggregate(cbind(likes_count, comments_count, shares_count) ~ hour,
                         data = post2016, mean, na.rm = TRUE)
byhourweek_sum <- aggregate(cbind(likes_count, comments_count, shares_count) ~ hour + week,
                            data = post2016, sum, na.rm = TRUE)

3.2 summarize by dplyr::summarize()

# Per-hour engagement statistics: count, mean, sum, and sd of comments,
# shares, and likes.
# FIX: result renamed from `summary` (which masked base::summary()), and the
# reassignable shortcut T is spelled out as TRUE.
hourly_stats <- post2016 %>%
    group_by(hour) %>%
    summarize(
        n = n(),
        mean_comment = mean(comments_count, na.rm = TRUE),
        mean_share = mean(shares_count, na.rm = TRUE),
        mean_like = mean(likes_count, na.rm = TRUE),
        sum_comment = sum(comments_count, na.rm = TRUE),
        sum_share = sum(shares_count, na.rm = TRUE),
        sum_like = sum(likes_count, na.rm = TRUE),
        sd_comment = sd(comments_count, na.rm = TRUE),
        sd_share = sd(shares_count, na.rm = TRUE),
        sd_like = sd(likes_count, na.rm = TRUE)
    )

4 data conversion

library(tidyr)
# Hour x weekday contingency counts, padded so all 24 hours appear.
t1 <- count(post2016, hour, wday)
t2 <- spread(t1, wday, n, fill = 0)
# FIX: name the join key explicitly instead of relying on the
# "Joining, by = ..." guess emitted at run time.
t3 <- right_join(t2, data.frame(hour = 0:23), by = "hour")
t3[is.na(t3)] <- 0   # hours with no posts at all become 0, not NA

# Same result as a single pipeline, reshaped back to long form.
# NOTE(review): spread()/gather() and funs() are superseded by
# pivot_wider()/pivot_longer() and across() in current tidyr/dplyr; kept
# as-is to match the package versions used elsewhere in this document.
myt <- post2016 %>%
    count(hour, wday) %>%
    spread(wday, n, fill = 0) %>%
    right_join(data.frame(hour = 0:23), by = "hour") %>%
    mutate_all(funs(ifelse(is.na(.), 0, .))) %>%
    gather(wday, n, -hour)

5 Plotting by plot{base}

5.1 plot() scatter

# Log message length per post; posts with a missing message score 0.
# NOTE(review): a zero-length message would yield log(0) = -Inf -- confirm
# messages are never empty strings.
post2016$nchar <- ifelse(is.na(post2016$message), 0,
                         log(nchar(post2016$message)))
plot(post2016$nchar, post2016$hour, xlab='nchar', ylab='time(hour)') 

# Scatter plot of y against x with a least-squares trend line.
# pch = 1 draws open-circle point symbols (the original comment called it a
# "label style", which is inaccurate); ylim starts at 1, so zero counts fall
# below the plotted range.
myScatter <- function(x, y, xlab, ylab, ylim){
  plot(y ~ x, xlab = xlab, ylab = ylab, pch = 1, ylim = c(1, ylim))
  abline(lm(y ~ x), col = "red")  # regression line (y ~ x)
}
# Three stacked panels with slim margins.
par(mfrow = c(3, 1), mai = c(0.3, 0.3, 0.3, 0.3))
# FIX: the original passed the axis labels swapped (e.g. xlab = 'comments',
# ylab = 'hour') although x is the hour and y is the engagement count.
with(post2016, myScatter(hour, comments_count, 'hour', 'comments', 5000))
with(post2016, myScatter(hour, shares_count, 'hour', 'shares', 5000))
with(post2016, myScatter(hour, likes_count, 'hour', 'likes', 100000))

5.2 Assigning color by groups

# Engagement counts only, with incomplete rows removed (kmeans cannot
# handle NA).
selected <- post2016 %>%
    select(likes_count, comments_count) %>%
    na.omit()

# k-means cluster on the two engagement counts.
# FIX: kmeans() starts from random centers, so without a seed the cluster
# labels (and point colors) change on every run; nstart > 1 also makes the
# solution less dependent on a single random start.
set.seed(42)
cres <- kmeans(selected, centers = 3, nstart = 10)
# cres$centers
# cres$cluster
# Renamed from `colors`, which masks grDevices::colors().
cluster_cols <- c("#FF0000", "#00FF00", "#0000FF")
selected$color <- cluster_cols[cres$cluster]

plot(log(selected$likes_count), log(selected$comments_count), col=selected$color)

6 Plotting by qplot of ggplot2

library(ggplot2)

7 ggplot2

7.1 Scatter

  • Original qplot is qplot(comments_count, likes_count, color = type, data = post2016, geom = "jitter", alpha=I(0.2))
# Color-blind-friendly palette (hex values match the widely used
# Okabe-Ito "cbPalette").
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Log-likes against posting year, colored by post type.
# NOTE(review): geom_point() and geom_jitter() both draw every observation,
# so each point appears twice -- confirm the double layer is intended.
post2016 %>%
    ggplot(aes(log(likes_count), year, color = factor(type))) +
    geom_point(alpha = 0.5) +
    geom_jitter() +
    scale_colour_manual(values=cbPalette)

# post2016 %>%
#       mutate(hour = hour(created_time), 
#          month = month(created_time), 
#          week = week(created_time),
#          wday = wday(created_time),
#          year = year(created_time)) %>%
#   ggplot() + 
#   aes(year, month) + 
#   geom_point(alpha = 0.5)



# Posting activity: month vs year scatter.
post2016 %>%
    ggplot(aes(year, month)) +
    geom_point(alpha = 0.5)

# Posts per year.
# FIX: the original passed bin_width = 0.1 to geom_bar(), an unknown
# parameter that ggplot2 warned about and ignored ("Ignoring unknown
# parameters: bin_width"); geom_bar() counts discrete x values, so the
# argument is simply dropped.
post2016 %>%
    ggplot(aes(year)) +
    geom_bar()

7.2 Bar chart

  • geom_bar uses stat_count by default: it counts the number of cases at each x position.
  • geom_col uses stat_identity: it leaves the data as is.
# Counted bars: geom_bar()'s default stat_count tallies the posts per hour.
post2016 %>%
    ggplot(aes(hour)) +
    geom_bar(fill = "red")

# Pre-counted data drawn with stat = "identity", plus a smoothed trend.
post2016 %>%
    count(hour) %>%
    ggplot(aes(hour, n)) +
    geom_bar(stat = "identity", fill = "red") +
    geom_smooth()
## `geom_smooth()` using method = 'loess'

# geom_col() is the shorthand for geom_bar(stat = "identity").
post2016 %>%
    count(hour) %>%
    ggplot(aes(hour, n)) +
    geom_col(fill = "red") +
    geom_smooth()
## `geom_smooth()` using method = 'loess'

# Message length vs posting hour, one row panel per post type, with a
# linear trend per type.
# NOTE(review): geom_point() and geom_jitter() both draw every observation,
# so each point appears twice -- confirm the double layer is intended.
post2016 %>%
    mutate(nchar = ifelse(is.na(message), 0, nchar(message))) %>%
    ggplot(aes(hour, nchar, color=factor(type))) +
    geom_point(alpha = 0.1) + 
    facet_grid(type ~ .) + 
    geom_smooth(method = "lm") + 
    geom_jitter() + 
    scale_colour_brewer(palette="Spectral")

# Shares per week, one column panel per type; one type is highlighted red.
# The warnings captured below come from rows with NA shares_count (1565
# rows) and from a facet with too few unique x values for the default GAM
# smoother.
post2016 %>%
    ggplot(aes(week, shares_count, color=factor(type))) +
    geom_point(alpha = 0.2) + 
    facet_grid(. ~ type) + 
    geom_smooth() + 
    geom_jitter() + 
    scale_colour_manual(values = c("gray", "red", "gray", "gray", "gray", "gray"))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1565 rows containing non-finite values (stat_smooth).
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Removed 1565 rows containing missing values (geom_point).

## Warning: Removed 1565 rows containing missing values (geom_point).

8 qplot: Relationship between shares and likes

8.1 qplot density function of different types

# Density of likes_count over all posts.
qplot(x = likes_count, data = post2016, geom = "density")

# One density curve per post type.
qplot(x = likes_count, data = post2016, geom = "density", color = type)

8.2 qplot histogram

# Histogram of likes, bars stacked and filled by post type.
qplot(x = likes_count, data = post2016, geom = "histogram", fill = type)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# With a single continuous variable, qplot defaults to a histogram anyway.
qplot(x = likes_count, data = post2016, fill = type)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same view for comments.
qplot(x = comments_count, data = post2016, fill = type)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

8.3 qplot: likes count distribution categorized by types

# Likes distribution per type, facet panels stacked as rows.
qplot(x = likes_count, data = post2016, color = factor(type),
      facets = type ~ ., binwidth = 1000)

# Same distribution with facet panels side by side as columns.
qplot(x = likes_count, data = post2016, color = factor(type),
      facets = . ~ type, binwidth = 1000)

# Jittered scatter of comments vs likes, colored by type.
qplot(x = comments_count, y = likes_count, color = type, data = post2016,
      geom = "jitter", alpha = I(0.2))

# Jittered log-likes vs posting hour.
qplot(x = log(likes_count), y = hour, color = type, data = post2016,
      geom = "jitter", alpha = I(0.5))

## qplot with trend curve

# Scatter with the default trend curve; the knot warning captured below
# comes from a type with too few unique x values for the GAM smoother.
qplot(comments_count, likes_count, color = type, data = post2016, geom = c("point", "smooth"))
## `geom_smooth()` using method = 'gam'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

# FIX: qplot() does not accept `method` (the original run warned
# "Ignoring unknown parameters: method", so no linear fit was drawn).
# Build the plot with ggplot() and pass method to geom_smooth() instead.
ggplot(post2016, aes(comments_count, likes_count, color = type)) +
    geom_point() +
    geom_smooth(method = "lm")

8.4 qplot: Relationship between comments and likes counts, categorized by types

# Comments vs likes, one column panel per post type.
qplot(x = comments_count, y = likes_count, data = post2016, facets = . ~ type)

## qplot: comments count distribution categorized by types

# Same relationship with row panels per type, colored by type.
qplot(x = comments_count, y = likes_count, data = post2016,
      color = factor(type), facets = type ~ .)